#loading the required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(skimr)
library(DataExplorer)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(leaps)
library(caret)
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(DiagrammeR)
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
“Part 1: Read, Explore, and Clean Data”
#load dataset
dataset <- read.csv("Data.csv")
head(dataset)
## DATE UNRATE... CONSUMER.CONF.INDEX PPI.CONST.MAT. CPIALLITEMS
## 1 01-05-2022 3.6 106.4 352.857 123.3228
## 2 01-04-2022 3.6 107.3 343.730 121.9782
## 3 01-03-2022 3.6 107.2 345.852 121.3010
## 4 01-02-2022 3.8 110.5 343.583 119.7028
## 5 01-01-2022 4.0 113.8 345.742 118.6193
## 6 01-12-2021 3.9 115.8 335.032 117.6295
## INFLATION... MORTGAGE.INT..MONTHLY.AVG... MED.HOUSEHOLD.INCOME
## 1 8.581511 5.2300 NA
## 2 8.258629 4.9825 NA
## 3 8.542456 4.1720 NA
## 4 7.871064 3.7625 NA
## 5 7.479872 3.4450 NA
## 6 7.036403 3.0980 NA
## CORP..BOND.YIELD... MONTHLY.HOME.SUPPLY X..SHARE.OF.WORKING.POPULATION
## 1 4.13 8.4 NA
## 2 3.76 8.4 NA
## 3 3.43 7.0 NA
## 4 3.25 6.0 NA
## 5 2.93 5.7 NA
## 6 2.65 5.6 64.92413
## GDP.PER.CAPITA QUARTERLY.REAL.GDP QUARTERLY.GDP.GROWTH.RATE.... CSUSHPISA
## 1 74737 19699.47 -0.1442271 120.724
## 2 74737 19699.47 -0.1442271 121.813
## 3 73289 19727.92 -0.3956925 122.888
## 4 73289 19727.92 -0.3956925 123.831
## 5 73289 19727.92 -0.3956925 124.780
## 6 72171 19806.29 1.6807783 125.734
#check dimensions of data set
dim(dataset)
## [1] 241 15
# Get the structure of data set(We could have also used glimpse() here but structure function is easier to read)
str(dataset)
## 'data.frame': 241 obs. of 15 variables:
## $ DATE : chr "01-05-2022" "01-04-2022" "01-03-2022" "01-02-2022" ...
## $ UNRATE... : num 3.6 3.6 3.6 3.8 4 3.9 4.2 4.6 4.7 5.2 ...
## $ CONSUMER.CONF.INDEX : num 106 107 107 110 114 ...
## $ PPI.CONST.MAT. : num 353 344 346 344 346 ...
## $ CPIALLITEMS : num 123 122 121 120 119 ...
## $ INFLATION... : num 8.58 8.26 8.54 7.87 7.48 ...
## $ MORTGAGE.INT..MONTHLY.AVG... : num 5.23 4.98 4.17 3.76 3.44 ...
## $ MED.HOUSEHOLD.INCOME : int NA NA NA NA NA NA NA NA NA NA ...
## $ CORP..BOND.YIELD... : num 4.13 3.76 3.43 3.25 2.93 2.65 2.62 2.68 2.53 2.55 ...
## $ MONTHLY.HOME.SUPPLY : num 8.4 8.4 7 6 5.7 5.6 6.2 6.9 6.1 6.5 ...
## $ X..SHARE.OF.WORKING.POPULATION: num NA NA NA NA NA ...
## $ GDP.PER.CAPITA : int 74737 74737 73289 73289 73289 72171 72171 72171 69824 69824 ...
## $ QUARTERLY.REAL.GDP : num 19699 19699 19728 19728 19728 ...
## $ QUARTERLY.GDP.GROWTH.RATE.... : num -0.144 -0.144 -0.396 -0.396 -0.396 ...
## $ CSUSHPISA : num 121 122 123 124 125 ...
#summary of data
summary(dataset)
## DATE UNRATE... CONSUMER.CONF.INDEX PPI.CONST.MAT.
## Length:241 Min. : 3.500 Min. : 25.00 Min. :143.8
## Class :character 1st Qu.: 4.700 1st Qu.: 70.40 1st Qu.:183.3
## Mode :character Median : 5.600 Median : 94.50 Median :206.2
## Mean : 6.075 Mean : 90.81 Mean :206.9
## 3rd Qu.: 7.300 3rd Qu.:108.20 3rd Qu.:221.7
## Max. :14.700 Max. :138.40 Max. :352.9
##
## CPIALLITEMS INFLATION... MORTGAGE.INT..MONTHLY.AVG...
## Min. : 75.86 Min. :-2.097 Min. :2.684
## 1st Qu.: 87.72 1st Qu.: 1.464 1st Qu.:3.803
## Median : 96.82 Median : 2.071 Median :4.457
## Mean : 95.54 Mean : 2.296 Mean :4.698
## 3rd Qu.:103.26 3rd Qu.: 2.970 3rd Qu.:5.812
## Max. :123.32 Max. : 8.582 Max. :6.806
##
## MED.HOUSEHOLD.INCOME CORP..BOND.YIELD... MONTHLY.HOME.SUPPLY
## Min. :42409 Min. :2.140 Min. : 3.300
## 1st Qu.:49007 1st Qu.:3.690 1st Qu.: 4.600
## Median :50303 Median :4.340 Median : 5.500
## Mean :53274 Mean :4.471 Mean : 5.974
## 3rd Qu.:59039 3rd Qu.:5.410 3rd Qu.: 6.700
## Max. :68703 Max. :6.750 Max. :12.200
## NA's :17
## X..SHARE.OF.WORKING.POPULATION GDP.PER.CAPITA QUARTERLY.REAL.GDP
## Min. :64.92 Min. :37860 Min. :13477
## 1st Qu.:65.62 1st Qu.:46977 1st Qu.:15305
## Median :66.74 Median :51554 Median :16254
## Mean :66.41 Mean :52896 Mean :16536
## 3rd Qu.:67.13 3rd Qu.:58745 3rd Qu.:17897
## Max. :67.30 Max. :74737 Max. :19806
## NA's :5
## QUARTERLY.GDP.GROWTH.RATE.... CSUSHPISA
## Min. :-8.9373 Min. :120.7
## 1st Qu.: 0.2936 1st Qu.:147.4
## Median : 0.5800 Median :169.8
## Mean : 0.4901 Mean :175.3
## 3rd Qu.: 0.8339 3rd Qu.:189.7
## Max. : 7.5475 Max. :304.8
##
#numerical characteristics from the summary, along with missing values, additional quantile data, and an inline histogram for each variable
skimmed_dataset = skim(dataset)
skimmed_dataset
| Name | dataset |
| Number of rows | 241 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| DATE | 0 | 1 | 10 | 10 | 0 | 241 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| UNRATE… | 0 | 1.00 | 6.07 | 1.99 | 3.50 | 4.70 | 5.60 | 7.30 | 14.70 | ▇▃▃▁▁ |
| CONSUMER.CONF.INDEX | 0 | 1.00 | 90.81 | 25.87 | 25.00 | 70.40 | 94.50 | 108.20 | 138.40 | ▁▅▆▇▅ |
| PPI.CONST.MAT. | 0 | 1.00 | 206.95 | 40.48 | 143.80 | 183.30 | 206.20 | 221.70 | 352.86 | ▅▇▂▁▁ |
| CPIALLITEMS | 0 | 1.00 | 95.54 | 11.09 | 75.86 | 87.72 | 96.82 | 103.26 | 123.32 | ▅▆▇▅▁ |
| INFLATION… | 0 | 1.00 | 2.30 | 1.64 | -2.10 | 1.46 | 2.07 | 2.97 | 8.58 | ▁▇▆▁▁ |
| MORTGAGE.INT..MONTHLY.AVG… | 0 | 1.00 | 4.70 | 1.12 | 2.68 | 3.80 | 4.46 | 5.81 | 6.81 | ▃▇▅▃▅ |
| MED.HOUSEHOLD.INCOME | 17 | 0.93 | 53273.98 | 7475.32 | 42409.00 | 49007.25 | 50303.00 | 59039.00 | 68703.00 | ▅▇▃▃▂ |
| CORP..BOND.YIELD… | 0 | 1.00 | 4.47 | 1.08 | 2.14 | 3.69 | 4.34 | 5.41 | 6.75 | ▃▆▃▇▂ |
| MONTHLY.HOME.SUPPLY | 0 | 1.00 | 5.97 | 1.90 | 3.30 | 4.60 | 5.50 | 6.70 | 12.20 | ▇▇▃▁▁ |
| X..SHARE.OF.WORKING.POPULATION | 5 | 0.98 | 66.41 | 0.80 | 64.92 | 65.62 | 66.74 | 67.13 | 67.30 | ▃▁▃▃▇ |
| GDP.PER.CAPITA | 0 | 1.00 | 52896.08 | 8840.59 | 37860.00 | 46977.00 | 51554.00 | 58745.00 | 74737.00 | ▅▇▅▃▁ |
| QUARTERLY.REAL.GDP | 0 | 1.00 | 16536.01 | 1708.44 | 13477.36 | 15304.52 | 16253.73 | 17896.62 | 19806.29 | ▃▇▅▃▅ |
| QUARTERLY.GDP.GROWTH.RATE…. | 0 | 1.00 | 0.49 | 1.45 | -8.94 | 0.29 | 0.58 | 0.83 | 7.55 | ▁▁▇▂▁ |
| CSUSHPISA | 0 | 1.00 | 175.31 | 36.78 | 120.72 | 147.40 | 169.81 | 189.71 | 304.83 | ▇▇▃▁▁ |
#DataExplorer::create_report(dataset)
‘Part 2: manipulate, wrangle, visualise data’
#Time series plot of inflation.
dataset$DATE <- as.Date(dataset$DATE, format="%d-%m-%Y")
timeplot <- ggplot(dataset, aes(x=DATE, y=INFLATION...)) +
geom_line()
timeplot
temp <- data.frame(
month = as.numeric(format(dataset$DATE, format = "%m")),
year = as.numeric(format(dataset$DATE, format = "%Y")))
#Splitting the char variable date into numerical variables month and year and creading a new dataset
#Removing Date variable from the dataset:
processedDataset <- dataset[-c(1)]
processedDataset <- cbind(temp,processedDataset)
#setting seed value for reproducibility
set.seed(1902)
#splitting data set into test and train set (70:30 split)
subset <- sample(nrow(processedDataset), nrow(processedDataset) * 0.7,replace=FALSE)
train <- processedDataset[subset, ]
nrow(train) / nrow(processedDataset)
## [1] 0.6970954
test <- processedDataset[-subset, ]
nrow(test) / nrow(processedDataset)
## [1] 0.3029046
dim(train)
## [1] 168 16
dim(test)
## [1] 73 16
#filling missing values in the training and test set(imputation using KNN)
train <-kNN(train)
test <-kNN(test)
# additional variables are created after performing knn imputation, Step to remove these extra variables:
train = subset(train, select = c(1:(ncol(train)/2)))
test = subset(test, select = c(1:(ncol(test)/2)))
skim(train)
| Name | train |
| Number of rows | 168 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| month | 0 | 1 | 6.63 | 3.55 | 1.00 | 4.00 | 6.50 | 10.00 | 12.00 | ▇▅▅▅▇ |
| year | 0 | 1 | 2010.95 | 5.79 | 2002.00 | 2006.00 | 2010.00 | 2016.00 | 2022.00 | ▇▆▅▅▃ |
| UNRATE… | 0 | 1 | 6.07 | 1.81 | 3.50 | 4.70 | 5.60 | 7.30 | 9.90 | ▆▇▂▂▃ |
| CONSUMER.CONF.INDEX | 0 | 1 | 88.70 | 25.95 | 25.00 | 70.05 | 92.75 | 106.85 | 135.70 | ▂▅▅▇▅ |
| PPI.CONST.MAT. | 0 | 1 | 201.87 | 39.64 | 143.80 | 179.80 | 201.10 | 215.35 | 345.85 | ▅▇▂▁▁ |
| CPIALLITEMS | 0 | 1 | 93.80 | 10.99 | 75.86 | 85.11 | 92.62 | 101.40 | 121.30 | ▇▇▇▅▂ |
| INFLATION… | 0 | 1 | 2.36 | 1.55 | -2.10 | 1.51 | 2.08 | 3.16 | 8.54 | ▁▇▆▁▁ |
| MORTGAGE.INT..MONTHLY.AVG… | 0 | 1 | 4.83 | 1.14 | 2.68 | 3.91 | 4.61 | 5.92 | 6.81 | ▃▇▅▅▆ |
| MED.HOUSEHOLD.INCOME | 0 | 1 | 53063.36 | 7873.49 | 42409.00 | 48201.00 | 50303.00 | 59039.00 | 68703.00 | ▅▇▃▃▃ |
| CORP..BOND.YIELD… | 0 | 1 | 4.63 | 1.07 | 2.25 | 3.80 | 4.88 | 5.47 | 6.75 | ▂▆▃▇▂ |
| MONTHLY.HOME.SUPPLY | 0 | 1 | 5.98 | 1.95 | 3.30 | 4.50 | 5.50 | 6.75 | 11.60 | ▇▇▃▂▂ |
| X..SHARE.OF.WORKING.POPULATION | 0 | 1 | 66.49 | 0.78 | 64.92 | 65.88 | 66.74 | 67.13 | 67.30 | ▃▁▂▃▇ |
| GDP.PER.CAPITA | 0 | 1 | 51593.26 | 8718.61 | 37860.00 | 46089.00 | 49256.00 | 57493.00 | 73289.00 | ▅▇▃▃▂ |
| QUARTERLY.REAL.GDP | 0 | 1 | 16288.91 | 1704.46 | 13477.36 | 15216.65 | 15792.77 | 17645.06 | 19806.29 | ▃▇▃▃▃ |
| QUARTERLY.GDP.GROWTH.RATE…. | 0 | 1 | 0.61 | 1.00 | -2.18 | 0.32 | 0.59 | 0.85 | 7.55 | ▁▇▁▁▁ |
| CSUSHPISA | 0 | 1 | 180.37 | 39.58 | 122.89 | 148.07 | 173.74 | 201.07 | 304.83 | ▆▇▃▁▁ |
skim(test)
| Name | test |
| Number of rows | 73 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| month | 0 | 1 | 6.18 | 3.23 | 1.00 | 3.00 | 6.00 | 9.00 | 12.00 | ▇▅▆▆▅ |
| year | 0 | 1 | 2014.00 | 5.35 | 2003.00 | 2010.00 | 2015.00 | 2019.00 | 2022.00 | ▃▅▇▇▇ |
| UNRATE… | 0 | 1 | 6.09 | 2.36 | 3.50 | 4.40 | 5.40 | 7.50 | 14.70 | ▇▃▂▁▁ |
| CONSUMER.CONF.INDEX | 0 | 1 | 95.66 | 25.19 | 37.70 | 79.70 | 98.00 | 111.20 | 138.40 | ▂▅▇▇▆ |
| PPI.CONST.MAT. | 0 | 1 | 218.64 | 40.23 | 147.00 | 195.90 | 213.30 | 233.80 | 352.86 | ▃▇▅▁▁ |
| CPIALLITEMS | 0 | 1 | 99.55 | 10.30 | 77.59 | 92.47 | 100.35 | 106.69 | 123.32 | ▂▅▇▆▂ |
| INFLATION… | 0 | 1 | 2.14 | 1.85 | -1.29 | 1.17 | 1.97 | 2.70 | 8.58 | ▂▇▃▁▁ |
| MORTGAGE.INT..MONTHLY.AVG… | 0 | 1 | 4.39 | 1.02 | 2.77 | 3.69 | 4.17 | 4.95 | 6.70 | ▃▇▅▂▂ |
| MED.HOUSEHOLD.INCOME | 0 | 1 | 57087.07 | 7959.40 | 43318.00 | 50233.00 | 56516.00 | 63179.00 | 68703.00 | ▃▇▆▆▇ |
| CORP..BOND.YIELD… | 0 | 1 | 4.12 | 1.03 | 2.14 | 3.50 | 3.95 | 5.05 | 6.01 | ▃▅▇▃▅ |
| MONTHLY.HOME.SUPPLY | 0 | 1 | 5.96 | 1.77 | 3.40 | 4.90 | 5.50 | 6.70 | 12.20 | ▆▇▂▁▁ |
| X..SHARE.OF.WORKING.POPULATION | 0 | 1 | 66.13 | 0.85 | 64.92 | 65.35 | 66.11 | 66.92 | 67.30 | ▇▁▅▃▇ |
| GDP.PER.CAPITA | 0 | 1 | 55894.36 | 8434.15 | 39752.00 | 49256.00 | 56018.00 | 62786.00 | 74737.00 | ▂▇▇▆▂ |
| QUARTERLY.REAL.GDP | 0 | 1 | 17104.67 | 1587.57 | 13970.16 | 15769.91 | 17258.21 | 18679.60 | 19806.29 | ▃▇▇▃▇ |
| QUARTERLY.GDP.GROWTH.RATE…. | 0 | 1 | 0.21 | 2.15 | -8.94 | 0.17 | 0.57 | 0.79 | 7.55 | ▁▁▇▂▁ |
| CSUSHPISA | 0 | 1 | 163.65 | 26.03 | 120.72 | 144.58 | 158.23 | 180.91 | 245.80 | ▇▇▆▂▁ |
#Visualisation of Data:
pairs(processedDataset)
boxplot(processedDataset)
M = cor(rbind(test,train))
corrplot(M, method = 'number',tl.cex=0.5)
#Standardizing/Scaling data
train_unscaled <- train
train<-scale(train, center = TRUE, scale = TRUE)
train<-as.data.frame(train)
test_unscaled<-test
test<-scale(test, center = TRUE, scale = TRUE)
test<-as.data.frame(test)
‘Part 3: Running Models’
#Using forward selection to identify significant parameters
set.seed(1902)
intercept_only <- lm(INFLATION... ~ 1, data=train)
all <- lm(INFLATION... ~ ., data=train)
forward <- step(intercept_only, direction='forward', scope=formula(all), trace=0)
forward$anova
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 167 167.00000 0.9970119
## 2 + PPI.CONST.MAT. -1 16.5568698 166 150.44313 -14.5436450
## 3 + year -1 70.3561888 165 80.08694 -118.4629962
## 4 + QUARTERLY.REAL.GDP -1 12.9654830 164 67.12146 -146.1333120
## 5 + month -1 3.8844731 163 63.23699 -154.1485395
## 6 + CPIALLITEMS -1 6.3584614 162 56.87852 -169.9517526
## 7 + MONTHLY.HOME.SUPPLY -1 7.7652427 161 49.11328 -192.6121941
## 8 + CSUSHPISA -1 2.3440123 160 46.76927 -198.8279237
## 9 + GDP.PER.CAPITA -1 3.1841301 159 43.58514 -208.6736205
## 10 + CONSUMER.CONF.INDEX -1 1.2780463 158 42.30709 -211.6735520
## 11 + UNRATE... -1 0.8468738 157 41.46022 -213.0705728
## 12 + MED.HOUSEHOLD.INCOME -1 0.5351031 156 40.92512 -213.2529659
forward$coefficients
## (Intercept) PPI.CONST.MAT. year
## -1.275481e-14 1.215319e+00 -7.430751e+00
## QUARTERLY.REAL.GDP month CPIALLITEMS
## 6.457188e+00 -3.846131e-01 4.599175e+00
## MONTHLY.HOME.SUPPLY CSUSHPISA GDP.PER.CAPITA
## -3.238516e-01 6.816169e-01 -3.551850e+00
## CONSUMER.CONF.INDEX UNRATE... MED.HOUSEHOLD.INCOME
## -2.457044e-01 2.579252e-01 -4.553320e-01
# From the above data we see that the following predictors are the most statistically significant
RSQUARE = function(y_actual,y_predict){
cor(y_actual,y_predict)^2
}
#Linear Regression Model
set.seed(1902)
linearModel = lm(INFLATION... ~ PPI.CONST.MAT. + CPIALLITEMS + MORTGAGE.INT..MONTHLY.AVG... + CORP..BOND.YIELD... + MED.HOUSEHOLD.INCOME + CSUSHPISA + MONTHLY.HOME.SUPPLY + X..SHARE.OF.WORKING.POPULATION, data = train)
summary(linearModel)
##
## Call:
## lm(formula = INFLATION... ~ PPI.CONST.MAT. + CPIALLITEMS + MORTGAGE.INT..MONTHLY.AVG... +
## CORP..BOND.YIELD... + MED.HOUSEHOLD.INCOME + CSUSHPISA +
## MONTHLY.HOME.SUPPLY + X..SHARE.OF.WORKING.POPULATION, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.38890 -0.29445 0.02902 0.34231 1.30279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.733e-15 4.834e-02 0.000 1.000000
## PPI.CONST.MAT. 1.715e+00 1.756e-01 9.769 < 2e-16 ***
## CPIALLITEMS 4.643e-01 3.837e-01 1.210 0.228018
## MORTGAGE.INT..MONTHLY.AVG... 1.203e+00 2.084e-01 5.772 3.99e-08 ***
## CORP..BOND.YIELD... -8.827e-01 2.404e-01 -3.671 0.000329 ***
## MED.HOUSEHOLD.INCOME -7.357e-01 3.267e-01 -2.252 0.025673 *
## CSUSHPISA 6.262e-01 2.170e-01 2.886 0.004450 **
## MONTHLY.HOME.SUPPLY -2.471e-01 9.888e-02 -2.499 0.013456 *
## X..SHARE.OF.WORKING.POPULATION 6.712e-01 3.127e-01 2.147 0.033346 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6265 on 159 degrees of freedom
## Multiple R-squared: 0.6263, Adjusted R-squared: 0.6075
## F-statistic: 33.3 on 8 and 159 DF, p-value: < 2.2e-16
linearPred <- predict(linearModel, test)
lm_rmse <- rmse(test$INFLATION..., linearPred) #0.5718129
AIC(linearModel)
## [1] 330.4117
BIC(linearModel)
## [1] 361.6513
#Ridge Regression Model
set.seed(1902)
#define response variable
y <- train$INFLATION...
#define matrix of predictor variables
x <- data.matrix(train[, c('PPI.CONST.MAT.','CPIALLITEMS', 'MORTGAGE.INT..MONTHLY.AVG...', 'CORP..BOND.YIELD...', 'MED.HOUSEHOLD.INCOME', 'CSUSHPISA', 'MONTHLY.HOME.SUPPLY', 'X..SHARE.OF.WORKING.POPULATION')])
xtest <- data.matrix(test[, c('PPI.CONST.MAT.','CPIALLITEMS', 'MORTGAGE.INT..MONTHLY.AVG...', 'CORP..BOND.YIELD...', 'MED.HOUSEHOLD.INCOME', 'CSUSHPISA', 'MONTHLY.HOME.SUPPLY', 'X..SHARE.OF.WORKING.POPULATION')])
ytest <- test$INFLATION...
#perform k-fold cross-validation to find optimal lambda value
crossModelRidge <- cv.glmnet(x, y, alpha = 0)
lambdas <- 10^seq(2, -3, by = -.1)
ridgeRegression <- glmnet(x, y, alpha = 0,family = 'gaussian', lambda = lambdas)
summary(ridgeRegression)
## Length Class Mode
## a0 51 -none- numeric
## beta 408 dgCMatrix S4
## df 51 -none- numeric
## dim 2 -none- numeric
## lambda 51 -none- numeric
## dev.ratio 51 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 6 -none- call
## nobs 1 -none- numeric
#find optimal lambda value that minimizes test MSE
bestLambda <- crossModelRidge$lambda.min
bestLambda
## [1] 0.03139312
#produce plot of test MSE by lambda value
plot(crossModelRidge)
#find coefficients of best model
bestModel <- glmnet(x, y, alpha = 0, lambda = bestLambda)
coef(bestModel)
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -2.013068e-15
## PPI.CONST.MAT. 1.350516e+00
## CPIALLITEMS 6.493612e-02
## MORTGAGE.INT..MONTHLY.AVG... 8.674415e-01
## CORP..BOND.YIELD... -3.103988e-01
## MED.HOUSEHOLD.INCOME -3.238406e-01
## CSUSHPISA 2.282555e-01
## MONTHLY.HOME.SUPPLY -2.236387e-01
## X..SHARE.OF.WORKING.POPULATION 3.424238e-01
plot(ridgeRegression, xvar = "lambda")
#use fitted best model to make predictions
yPredicted <- predict(ridgeRegression, s = bestLambda, newx = xtest)
ridge_rmse <-rmse(ytest,yPredicted) #0.4914431
#Lasso Regression Model
set.seed(1902)
#perform k-fold cross-validation to find optimal lambda value
crossModel2 <- cv.glmnet(x, y, alpha = 1)
#find optimal lambda value that minimizes test MSE
bestLambda2 <- crossModel2$lambda.min
bestLambda2
## [1] 0.0002430655
#produce plot of test MSE by lambda value
plot(crossModel2)
#find coefficients of best model
bestModel2 <- glmnet(x, y, alpha = 1, lambda = bestLambda2)
coef(bestModel2)
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -3.973851e-15
## PPI.CONST.MAT. 1.712321e+00
## CPIALLITEMS 4.500882e-01
## MORTGAGE.INT..MONTHLY.AVG... 1.199470e+00
## CORP..BOND.YIELD... -8.769051e-01
## MED.HOUSEHOLD.INCOME -7.299107e-01
## CSUSHPISA 6.174392e-01
## MONTHLY.HOME.SUPPLY -2.455779e-01
## X..SHARE.OF.WORKING.POPULATION 6.626533e-01
#use fitted best model to make predictions
yPredicted2 <- predict(bestModel2, s = bestLambda2, newx = xtest)
lasso_rmse <-rmse(ytest,yPredicted2) #0.5697695
#Random Forest Model
set.seed(1902)
randomForestModel <- randomForest(INFLATION... ~ PPI.CONST.MAT. + CPIALLITEMS + MORTGAGE.INT..MONTHLY.AVG... + CORP..BOND.YIELD... + MED.HOUSEHOLD.INCOME + CSUSHPISA + MONTHLY.HOME.SUPPLY + X..SHARE.OF.WORKING.POPULATION, data = train, mtry = 3, importance = TRUE, na.action = na.omit)
randomForestModel
##
## Call:
## randomForest(formula = INFLATION... ~ PPI.CONST.MAT. + CPIALLITEMS + MORTGAGE.INT..MONTHLY.AVG... + CORP..BOND.YIELD... + MED.HOUSEHOLD.INCOME + CSUSHPISA + MONTHLY.HOME.SUPPLY + X..SHARE.OF.WORKING.POPULATION, data = train, mtry = 3, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.1414103
## % Var explained: 85.77
plot(randomForestModel)
summary(randomForestModel)
## Length Class Mode
## call 6 -none- call
## type 1 -none- character
## predicted 168 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 168 -none- numeric
## importance 16 -none- numeric
## importanceSD 8 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 168 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
rfPredictions = predict(randomForestModel, test)
mean((rfPredictions - test$INFLATION...)^2)
## [1] 0.4569684
rf_rmse<-rmse(test$INFLATION..., rfPredictions) #0.6759944
#XGBOOST MODEL
set.seed(1902)
train_x = data.matrix(train[, -7])
train_y = train[,7]
test_x = data.matrix(test[, -7])
test_y = test[, 7]
xgb_train = xgb.DMatrix(data = train_x, label = train_y)
xgb_test = xgb.DMatrix(data = test_x, label = test_y)
watchlist = list(train=xgb_train, test=xgb_test)
model = xgb.train(data = xgb_train, max.depth = 3, watchlist=watchlist, nrounds = 70) #From the output we can see that the minimum testing RMSE is achieved at 26 rounds.
## [1] train-rmse:0.897246 test-rmse:0.866098
## [2] train-rmse:0.739680 test-rmse:0.767033
## [3] train-rmse:0.640073 test-rmse:0.680113
## [4] train-rmse:0.571949 test-rmse:0.623681
## [5] train-rmse:0.523814 test-rmse:0.590589
## [6] train-rmse:0.475864 test-rmse:0.624883
## [7] train-rmse:0.446896 test-rmse:0.617974
## [8] train-rmse:0.395943 test-rmse:0.566077
## [9] train-rmse:0.360114 test-rmse:0.552384
## [10] train-rmse:0.331065 test-rmse:0.551328
## [11] train-rmse:0.308852 test-rmse:0.560815
## [12] train-rmse:0.286277 test-rmse:0.575717
## [13] train-rmse:0.272002 test-rmse:0.570884
## [14] train-rmse:0.263641 test-rmse:0.570806
## [15] train-rmse:0.255383 test-rmse:0.570363
## [16] train-rmse:0.245911 test-rmse:0.568605
## [17] train-rmse:0.237166 test-rmse:0.567948
## [18] train-rmse:0.225343 test-rmse:0.566148
## [19] train-rmse:0.212451 test-rmse:0.568949
## [20] train-rmse:0.198938 test-rmse:0.567187
## [21] train-rmse:0.194421 test-rmse:0.564668
## [22] train-rmse:0.182341 test-rmse:0.559164
## [23] train-rmse:0.169826 test-rmse:0.554380
## [24] train-rmse:0.164237 test-rmse:0.558524
## [25] train-rmse:0.155566 test-rmse:0.556023
## [26] train-rmse:0.148065 test-rmse:0.557349
## [27] train-rmse:0.146544 test-rmse:0.557649
## [28] train-rmse:0.139689 test-rmse:0.560763
## [29] train-rmse:0.134906 test-rmse:0.563738
## [30] train-rmse:0.131989 test-rmse:0.565147
## [31] train-rmse:0.124874 test-rmse:0.562844
## [32] train-rmse:0.122828 test-rmse:0.563370
## [33] train-rmse:0.121044 test-rmse:0.564114
## [34] train-rmse:0.118629 test-rmse:0.564537
## [35] train-rmse:0.114708 test-rmse:0.563925
## [36] train-rmse:0.112203 test-rmse:0.565329
## [37] train-rmse:0.107976 test-rmse:0.565435
## [38] train-rmse:0.104595 test-rmse:0.564920
## [39] train-rmse:0.100645 test-rmse:0.565461
## [40] train-rmse:0.098298 test-rmse:0.567951
## [41] train-rmse:0.094117 test-rmse:0.566983
## [42] train-rmse:0.092052 test-rmse:0.566956
## [43] train-rmse:0.088535 test-rmse:0.566991
## [44] train-rmse:0.085178 test-rmse:0.566165
## [45] train-rmse:0.082362 test-rmse:0.566716
## [46] train-rmse:0.079926 test-rmse:0.566752
## [47] train-rmse:0.078685 test-rmse:0.567498
## [48] train-rmse:0.074283 test-rmse:0.566310
## [49] train-rmse:0.071731 test-rmse:0.566383
## [50] train-rmse:0.070841 test-rmse:0.566452
## [51] train-rmse:0.070058 test-rmse:0.566555
## [52] train-rmse:0.068420 test-rmse:0.566996
## [53] train-rmse:0.067011 test-rmse:0.565867
## [54] train-rmse:0.066064 test-rmse:0.566169
## [55] train-rmse:0.063994 test-rmse:0.564853
## [56] train-rmse:0.062709 test-rmse:0.565801
## [57] train-rmse:0.061567 test-rmse:0.565276
## [58] train-rmse:0.059644 test-rmse:0.567288
## [59] train-rmse:0.058105 test-rmse:0.568088
## [60] train-rmse:0.056176 test-rmse:0.568228
## [61] train-rmse:0.055627 test-rmse:0.568421
## [62] train-rmse:0.054124 test-rmse:0.568500
## [63] train-rmse:0.053086 test-rmse:0.568598
## [64] train-rmse:0.051967 test-rmse:0.569344
## [65] train-rmse:0.049996 test-rmse:0.567794
## [66] train-rmse:0.049632 test-rmse:0.568462
## [67] train-rmse:0.049237 test-rmse:0.567722
## [68] train-rmse:0.048476 test-rmse:0.567442
## [69] train-rmse:0.047141 test-rmse:0.568092
## [70] train-rmse:0.045849 test-rmse:0.566548
final = xgboost(data = xgb_train, max.depth = 3, nrounds = 26, verbose = 0) #we’ll define our final XGBoost model to use 26 rounds
xgb.plot.tree(model = final, trees = 1:3)
pred_y = predict(final, test_x)
mean((test_y - pred_y)^2) #mse
## [1] 0.3106376
MAE(test_y, pred_y) #mae
## [1] 0.4356466
xg_rmse<-rmse(test_y, pred_y) #rmse 0.5573487
#RMSE Comparison for the models.
x= c('lm','ridge','lasso','rf','xgboost')
y= c(lm_rmse,ridge_rmse,lasso_rmse,rf_rmse,xg_rmse)
sample_data <- data.frame(name = c('lm','ridge','lasso','rf','xgboost') ,
value = c(lm_rmse,ridge_rmse,lasso_rmse,rf_rmse,xg_rmse))
plot<-ggplot(sample_data,
aes(name,value)) +
geom_bar(stat = "identity") + xlab("ML Model")+ylab("RMSE Value")
plot
#The plot shows that Ridge regression works best for the following dataset to predict inflation.
#Comparison of actual test values vs values predicted using Ridge Regression Model.
df1 = data.frame(Value_Type =rep (c("Actual"),length(test$INFLATION...)),inflation = test$INFLATION...,year=test_unscaled$year)
df2 = data.frame(Value_Type =rep (c("Predicted"),length(yPredicted)),inflation = yPredicted,year=test_unscaled$year)
df2<- rename(df2,inflation=s1)
final_df = union_all(df1,df2)
plt <- ggplot(data=final_df, aes(x=year, y=inflation, color=Value_Type))+
geom_line()+
ggtitle("Inflation Prediction")
plt